\documentclass{article}
\usepackage{savesym}
\usepackage{amsmath}
\usepackage{cases}
\usepackage{amssymb}
\usepackage[T1]{fontenc}
\usepackage[ansinew]{inputenc}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{color}
\usepackage[cc]{titlepic}
\usepackage{tabularx}
\usepackage{lipsum,appendix}

\newcommand{\RZ}{\mathbb{R}}
\newcommand{\NZ}{\mathbb{N}}

\begin{document}

\begin{abstract}
Our abstract goes here...
...
\end{abstract}

\section{Introduction}
The nonparametric regression problem consists of learning an arbitrary mapping $h: \mathcal{X} \rightarrow \mathcal{Y}$ from a data set $(X_i,Y_i)_{i=1,2,...,n}$, where the $Y_i$ values are corrupted by noise of mean zero. If the regression function $h$ is in some sense $p$ times continuously differentiable (e.g. $h$ is $\alpha$-H\"older [GKKW02]), it is known that the optimal rate of convergence in this case is $n^{-\frac{2p}{2p+D}}$, where $D$ is the dimensionality of the predictor vector $X$. One sees that performance degrades as $D$ increases; this is the so-called curse of dimensionality.\\ Modern data are very often high dimensional, for instance in image retrieval or text classification. In the case that the predictor variables live on or close to a lower dimensional structure (a manifold, for instance), one tries procedures that adapt to this low dimensional structure.\\
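To make the curse concrete, here is a quick back-of-the-envelope illustration (our own numbers, not taken from the cited works): for $p=1$, reaching error $\epsilon$ requires on the order of $n \asymp \epsilon^{-(2+D)/2}$ samples, so
\begin{center}
$D=2: \quad n \asymp \epsilon^{-2}, \qquad D=18: \quad n \asymp \epsilon^{-10},$
\end{center}
i.e. the sample size required for a fixed accuracy grows exponentially with $D$.\\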
 In their recent work [DF08], Dasgupta and Freund present a randomized variant of the $k$-d tree, the so-called Random Projection tree (or RP tree), which automatically adapts to intrinsic low dimensional structure in the data without having to learn this structure explicitly. \\
 In this paper we study an RP tree-based regressor whose convergence rate depends on this actual dimension $d\ll D$, under the assumption that the predictor space $\mathcal{X}$ has {\em{doubling dimension}} $d$.

\subsection{Intrinsic dimensionality}
The notion of intrinsic dimension of data is typically used to mean that the data lie near a low dimensional structure. Several formalizations exist: the {\em{Assouad}} (or {\em{doubling}}) {\em{dimension}}, the {\em{manifold dimension}}, the {\em{affine dimension}}... In this paper we adopt the notion of {\em{doubling dimension}}, defined as follows:
\paragraph{Definition 1.} {\em{For any point $x\in \RZ^{D}$ and any $r>0$, let $B(x,r)=\{z:\|x-z\|\leq r\}$ denote the closed ball of radius $r$ centered at $x$.}}
{\em{The doubling dimension of $\mathcal{X} \subset \RZ^{D}$ is the smallest $d$ such that for any ball $B=B(x,r) \subset \RZ^{D}$, the set $B\cap\mathcal{X}$ can be covered by $2^d$ balls of radius $r/2$.}}\\

The {\em{doubling dimension}} is particularly attractive in the sense that it generalizes the notion of {\em{manifold dimension}}, the notion of {\em{affine dimension}} and the notion of sparsity (see for instance [Kpo11]), while at the same time being amenable to the kinds of analysis that arise in algorithm design.

If $\mathcal{X}$ is, for instance, a line in $\RZ^{D}$, then $\mathcal{X}\cap B$ is a line segment for any ball $B \subset \RZ^{D}$ and can be covered by two balls whose radii are half that of $B$. Hence the doubling dimension of $\mathcal{X}$ is $1$.
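More generally, a standard covering argument (sketched here only as an illustration) shows that any $\mathcal{X}$ contained in a $d$-dimensional affine subspace of $\RZ^D$ has doubling dimension $O(d)$: for any ball $B$ of radius $r$, the set $B\cap\mathcal{X}$ lies in a $d$-dimensional ball of radius $r$, which can be covered by at most $5^d$ balls of radius $r/2$. Hence
\begin{center}
the doubling dimension of $\mathcal{X}$ is at most $d\log_2 5 = O(d)$.
\end{center}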


\subsection{Set up}
Here we follow and adapt for our purposes the setting and notation of [Kpo11].
Let \texttt{X} be the set of predictor data points and $\mathcal{A}$ be a partition of $\RZ^{D}$.
For a cell $A\in \mathcal{A}$ we denote by $\Delta(A)$ 
\begin{center}
$\Delta(A):=\max\limits_{x, x'\in A}\| x-x'\|$
\end{center}
the {\em{physical diameter}} of $A$, by $\Delta_n(A)$
\begin{center}
$\Delta_n(A):= \max\limits_{x,x'\in A\cap \texttt{X}} \| x-x'\|$
\end{center}
the {\em{data diameter}} of $A$ (or $0$ if $A\cap\texttt{X}$ is empty), and by $\Delta_n(\mathcal{A})$

\begin{center}
$\Delta_n(\mathcal{A}):= \left(\sum\limits_{A\in\mathcal{A}}\mu_n(A)\Delta_n^{2\alpha}(A)\right)^{\frac{1}{2\alpha}}$
\end{center} 

the $\alpha$-{\em{average data diameter}} of the partition $\mathcal{A}$.\\
	Let $(X_i,Y_i)_{i=1,2,...,n}$ be i.i.d. $\RZ^{D+1}$-valued random vectors, where the predictor $X$ lies in a space $\mathcal{X}\subset \RZ^{D}$ and the corresponding univariate response $Y$ lies in a space $\mathcal{Y}\subset \RZ$. We assume that the two spaces $\mathcal{X}$, $\mathcal{Y}$ lie within balls of diameter $\Delta_{\mathcal{X}}$ and $\Delta_{\mathcal{Y}}$ respectively, and that $\mathcal{X}$ is convex. On $\RZ^D$ we use the Euclidean norm. Let $\nu$ be the unknown underlying distribution over the joint predictor-response space $\mathcal{X}\times\mathcal{Y}$ and $\mu$ the marginal distribution over $\mathcal{X}$.
	The data set $(X_i,Y_i)_{i=1,2,...,n}$ defines an empirical distribution which assigns mass $1/n$ to each of the $n$ support points. We denote by $\mu_n$ the marginal empirical distribution over $\mathcal{X}$.
	
	Our goal is to estimate ${h(x) := E[Y|X=x]}$ nonparametrically. It is known that the achievable rate of convergence depends on how smooth $h$ is. Here we assume that $h$ is ($\alpha,C$)-smooth, that is:

\paragraph{Definition 2.} 
Let $A \subseteq \RZ^D$, $C > 0$, $\alpha = m + r$ with $m \in \NZ_0$ and $0 < r \leq 1$. A function $h : A \rightarrow \RZ$ is called ($\alpha$,C)-{\em{smooth}} if for every $\beta = (\beta_1,\beta_2,...,\beta_D)$, $\beta_i \in \NZ_0$, $\sum_{i=1}^D \beta_i = m$, the partial derivative $\frac{\partial^m h}{\partial x_1^{\beta_1}...\partial x_D^{\beta_D}}$ exists and satisfies:

\begin{center}
$\left|\frac{\partial^m h}{\partial x_1^{\beta_1}...\partial x_D^{\beta_D}}(x)-\frac{\partial^m h}{\partial x_1^{\beta_1}...\partial x_D^{\beta_D}}(y)\right| \leq  C\cdot\|x-y\|^r \quad  (x,y \in  A).$
\end{center}	
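As a quick illustration (our own example, with a constant obtained by a direct computation): for $D=1$, the function $h(x) = |x|^{3/2}$ is ($\alpha$,C)-{\em{smooth}} with $\alpha = 3/2$ (i.e. $m=1$, $r=1/2$), since $h'(x) = \frac{3}{2}\,\mathrm{sign}(x)|x|^{1/2}$ exists everywhere and
\begin{center}
$|h'(x)-h'(y)| \leq \frac{3\sqrt{2}}{2}\cdot|x-y|^{1/2} \quad (x,y \in \RZ),$
\end{center}
so one may take $C = \frac{3\sqrt{2}}{2}$. For $\alpha = 1$ ($m=0$, $r=1$) the definition reduces to the familiar Lipschitz condition $|h(x)-h(y)| \leq C\|x-y\|$.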

For some estimate $h_n$: $\mathcal{X} \rightarrow\mathcal{Y}$ of $h$, we define 
the integrated excess risk of $h_n$ over $h$ as the average $L_2$ error $E\|h_n-h\|^2$ where
\begin{center}
 $\|h_n-h\|^2 := E_X|h(X)-h_n(X)|^2$
\end{center}

Let $P_m$ be the vector space of polynomials of degree at most $m$ on $\RZ^D$ and let $H_{m,A}$

\begin{center} 
$H_{m,A} :=\{h(x)=\texttt{1}_{\{x\in A\}}p(x);\quad p\in P_m\}$
\end{center}

 be the vector space of polynomials of degree at most $m$ on $A$ for a fixed $A\subset \RZ^D$, and let $D_m$ be the dimension of $H_{m,A}$.
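For orientation (a standard count, assuming $A$ has nonempty interior): the monomials $x^\beta$ with $|\beta| := \sum_i \beta_i \leq m$ form a basis of $H_{m,A}$, hence
\begin{center}
$D_m = \binom{D+m}{D} \leq (m+1)^D,$
\end{center}
where the inequality holds since every such $\beta$ satisfies $\beta_i \leq m$ for each $i$; e.g. for $D=2$ and $m=2$ one gets $D_m = 6 \leq 9$. The bound $D_m \leq (m+1)^D$ is the one used in Theorem 2.1 below.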
 
 In this paper we will consider a piecewise polynomial regressor over $\mathcal{A}$ defined as follows: for any $x\in \mathcal{X}$
 
\begin{center}
$h_{n,\mathcal{A}}(x) = \sum\limits_{A\in \mathcal{A}} h_{n,A}(x)\cdot\texttt{1}_A(x)$,
\end{center}
where $h_{n,A}$ 
\begin{equation}
h_{n,A}(.) := \mathrm{arg}\min\limits_{h\in H_{m,A}} \frac{1}{n}\sum\limits_{i\in I(A)} |h(X_i)-Y_i|^2.
\end{equation}
is the element of $H_{m,A}$ that minimizes the empirical least squares error, and $I(A) = \{1 \leq i \leq n: X_i \in A \}$ is the set of indices of the points falling in the cell $A$.\\
Let ${\tilde{h}_{n,A} := E_Y(h_{n,A})}$ denote the expected estimator, where $E_Y(.) := E(.|X_1,...,X_n)$. For $M>0$ we define the truncation operator $T_M$ as follows:

\begin{numcases}{T_M(u) =}
 u, & $\mbox{if} \quad |u| \leq M \nonumber$ \\
 M\mathrm{sign}(u), & $\mbox{otherwise} \nonumber$
\end{numcases}

and

\begin{center}
$T_M H_{m,A} = \{T_M h : h\in H_{m,A}\}$
\end{center}
and let
\begin{center}
$\sigma^2 := \sup\limits_{x\in \mathcal{X}} \mathrm{Var}\{Y|X = x\} < \infty$.
\end{center}

\subsection{Recursive (randomized) construction of partitions}
A Linear Split Tree (LST) partition $\mathcal{A}$ is a partition obtained through recursive splitting by hyperplanes. The level of a partition $\mathcal{A}$ is equal to its depth in the tree. The various types of LST partitions differ in the splitting rule for cells. Here we consider the Das-Kpo (Dasgupta and Kpotufe) LST partitions, which have the following properties:
\begin{itemize}
\item The algorithm takes as input a data set $(X_1,...,X_n)$ and builds a sequence of LST partitions $\mathcal{A}^0,\mathcal{A}^1,\mathcal{A}^2,...$ with the property $\Delta_n(\mathcal{A}^i) \leq 2^{-i}\Delta_n(\mathcal{A}^0)$.
\item In the case of fixed design we have:\\
\textbf{Lemma 1.1} Suppose $(X_1,...,X_n)$ are fixed in $\mathcal{X}$ and $\mathcal{X}$ has {\em{doubling}} dimension $d$. Then with probability at least $1-\delta$ it holds that\\
$level(\mathcal{A}^i)\leq ki$,\\
where $k \leq C_0d\log d$ for an absolute constant $C_0$.
\item The hyperplane directions used belong to a fixed collection $\mathcal{P}$ independent of \texttt{X} with\\
 $|\mathcal{P}| \leq 8n^2\log(3n/\delta)$. [$Kpo11$]
\end{itemize}
To prune the sequence of partitions and select the final partition, two stopping rules are considered, namely the {\em{cross-validation}} stopping rule and the {\em{automatic}} stopping rule.
The cross-validation option requires a validation sample \texttt{(X',Y')}, also of size $n$, drawn from the same underlying distribution. The partition $\mathcal{A}^j$ that minimizes the empirical risk
\begin{center}
$R'_n(h_{n,\mathcal{A}^j}) := \frac{1}{n}\sum\limits_{i=1}^n|Y'_i - h_{n,\mathcal{A}^j}(X'_i)|^2$
\end{center}
is then chosen for the regression.\\
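The rationale is the standard one: since $(X',Y')$ is independent of the training data, the validation risk is, conditionally on the training data, an unbiased estimate of the true risk of each fixed candidate; indeed, using only $E[Y|X]=h(X)$,
\begin{center}
$E_{(X',Y')}\big[R'_n(h_{n,\mathcal{A}^j})\big] = E|Y-h(X)|^2 + \|h_{n,\mathcal{A}^j}-h\|^2,$
\end{center}
so minimizing $R'_n$ over the finitely many candidate partitions approximately minimizes the excess risk.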

{\small
\begin{tabular}{|l|} \hline
Procedure \it{ adaptiveRPtree(sample $X\subset \mathbb{R}^{D}$, confidence parameter  $\delta$} ) \\ \hline
 \it $\mathcal{A}^{0}\leftarrow \mathbb{R}^{D};$ \\
\bf{for} $i\leftarrow 1 \to \infty $ \bf{ do}
\\ \quad \bf{ foreach} \it cell $A\in \mathcal{A}^{i-1}$\bf{ do}
\\ \quad \quad (subtree rooted at $A$) $\leftarrow$ \texttt{coreRPtree}$(A,\Delta_n(A)/2,\delta)$;
\\ \quad \bf{ end}
\\ \quad $\mathcal{A}^{i}\leftarrow$ partition of $\mathbb{R}^{D}$ defined by  the leaves of the current tree; 
\\ \quad $level(\mathcal{A}^{i})\leftarrow \max_{A\in\mathcal{A}^{i}}level(A)$;// $level=depth$ in tree
\\ \quad// There are two options for stopping and returning a partition.
\\ \quad \begin{tabular}{|l|} \hline Option  1:  Cross-validation\\\hline \end{tabular}
\\ \quad \bf{if }$\Delta^{\alpha}_n(\mathcal{A}^{i})=0$ or $level(\mathcal{A}^{i})\geq 2\log n$\bf{ then}
\\ \quad \quad Define $R'_n(.)$ as the empirical risk on a validation sample   $(X',Y')$ of size $n$;
\\ \quad \quad $\mathcal{A}^{*}\leftarrow \arg\min_{\mathcal{A}\in\{\mathcal{A}^0,...,\mathcal{A}^{i}\}}  R'_n(h_{n,\mathcal{A}})$
 \\ \quad \quad \bf{ return }$h_n\doteq h_{n,\mathcal{A}^{*}}$;
 \\ \quad \bf{end}
 \\ \quad \begin{tabular}{|l|}\hline Option 2: Automatic stopping\\\hline \end{tabular}
 
 \\\quad \bf{if}  $\Delta^{2\alpha}_n(\mathcal{A}^{i})\leq \Delta^{2\alpha}_n(\mathcal{A}^0)\cdot\big(\frac{\log^2n}{n}\big)\cdot 2^{level(\mathcal{A}^{i})}$ \bf{then}
 \\ \quad \quad $\mathcal{A}^{*}\leftarrow \arg\min_{\mathcal{A}\in\{\mathcal{A}^{i-1},\mathcal{A}^{i}\}}\Big(\frac{\log^2n}{n}\cdot|\mathcal{A}|+\Delta^{2\alpha}_n(\mathcal{A})\Big)$;
 \\ \quad \quad \bf{return} $h_n\doteq h_{n,\mathcal{A}^*}$;
 \\\quad \bf{end}
 \\ \bf{end}
 \\ \hline
\end{tabular}
}

\vspace{0.5cm}

\subsection{Main results and related work}
Our bound for the excess risk of the regressor is expressed in terms of the {\em{diameter decrease rate}} $k$, which is defined as the maximum increase in tree depth incurred during each halving of the data diameter in the \texttt{adaptiveRPtree} procedure above. \\
Building upon the recent work of Dasgupta and Freund [DF08] as above-mentioned, we have

\paragraph{Theorem 1.1} Assume that $\mathcal{X}$ has {\em{doubling}} dimension $d$ and let $h$ be ($\alpha$,C)-{\em{smooth}}. Assume furthermore that
 \begin{center}
$\|h\|_\infty = \sup\limits_{x\in \mathcal{X}}|h(x)|\leq M \quad $and let \quad $\hat{h}_{n,A}(.) = T_M h_{n,A}(.)$.
\end{center}
For any $\delta > 0$, 
\begin{itemize}
\item if the {\em{automatic stopping}} option is used, we have
with probability at least $1-\delta$ over the choice of \texttt{X}, 
\begin{center}
$E_Y\|\hat{h}_{n,\mathcal{A}} - h\|^2 = O\bigg(\bigg(\frac{\gamma(n)}{n}\bigg)^{\frac{2\alpha}{2\alpha +k}}\bigg)$
\end{center}
\item if the {\em{cross-validation}} option is used, with probability at least $1-\delta$ over the choice of \texttt{X}, $(X',Y')$ and the randomness in the algorithm,
\begin{center}
$E_Y\|\hat{h}_{n,\mathcal{A}} - h\|^2 = O\bigg(\bigg(\frac{\gamma(n)}{n}\bigg)^{\frac{2\alpha}{2\alpha +k}}\bigg)$
\end{center}
\end{itemize}
where $\gamma(n):=\log^2n$ and $k=C_0d\log d$ for some constant $C_0$.
\\

In the case $\alpha = 1$ (i.e. $m=0$ and $r=1$) our result yields that of the recent work of Kpotufe [$Kpo11$]. In fact, building upon the work of Dasgupta and Freund, Kpotufe assumes a Lipschitz condition on the regression function and obtains the same rate in this case.\\
Bickel and Li [$BL07$] have recently shown that local kernel regressors are adaptive to manifold structure. Their risk bound is a function of the bandwidth, which can be chosen optimally so as to obtain a risk that depends only on the manifold dimension.

\section{Risk bound for the regressor with respect to the empirical norm $\|.\|_n$}

Let $\alpha \in [1,\infty)$ and assume that $\mathcal{X}$ is convex and has {\em{Assouad}} dimension $d$. In this section we upper bound the integrated risk with respect to the empirical $L_2$ norm, defined for all $A \subseteq \RZ^D$ and $h : A \rightarrow \RZ$ as follows:

\begin{center}
$\|h\|_n^2 = \int_{A}|h(x)|^2\mu_n(dx) = \frac{1}{n}\sum\limits_{i\in I(A)}|h(X_i)|^2.$
\end{center} 

Throughout this section \texttt{X} is considered fixed. The following lemma will be useful in what follows.

\paragraph{Lemma 2.1} $[St93]$ Let $\{h_{1,n},...,h_{{D_m},n}\}$ be a basis of $H_{m,A}$. Then:
\begin{equation}
h_{n,A} = \sum\limits_{l=1}^{D_m}a_l h_{l,n} \quad \mbox{with} \quad Z^TZa = Z^TY 
\end{equation} 
 where $Y = \left(\{Y_i\}_{i\in I(A)}\right)^T$, $a = (a_1,...,a_{D_m})^T$ and $Z = \left(h_{l,n}(X_i)\right)_{i\in I(A),l=1,...,{D_m}}$.
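For completeness, a one-line sketch of where the normal equations come from (standard least squares algebra): writing the empirical objective in matrix form,
\begin{center}
$\frac{1}{n}\sum\limits_{i\in I(A)}\Big|\sum\limits_{l=1}^{D_m} a_l h_{l,n}(X_i)-Y_i\Big|^2 = \frac{1}{n}\|Za-Y\|^2 \quad$ and $\quad \nabla_a\|Za-Y\|^2 = 2Z^T(Za-Y) = 0 \iff Z^TZa = Z^TY.$
\end{center}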

\subsection{Bias-variance decomposition of the risk}

Here we bound our integrated risk for any partition $\mathcal{A}$ of $\mathcal{X}$ as follows:

{\small
\begin{equation*}
\begin{array}{lll}
E_Y\left(\|h_{n,\mathcal{A}} - h\|_n^2\right) & & \\
\quad\quad\quad = \sum\limits_{A\in\mathcal{A}} E_Y\left(\int_A|h_{n,A}(x)-h(x)|^2\mu_n(dx)\right)  & & \\
\quad\quad\quad \leq \sum\limits_{A\in\mathcal{A}} E_Y\left(\int_{A}(|h_{n,A}(x)-\tilde{h}_{n,A}(x)|+|\tilde{h}_{n,A}(x)-h(x)|)^2\mu_n(dx)\right) & & \\
 \quad\quad\quad \leq \sum\limits_{A\in\mathcal{A}} E_Y\left(\int_{A}(2|h_{n,A}(x)-\tilde{h}_{n,A}(x)|^2 +2|\tilde{h}_{n,A}(x)-h(x)|^2)\mu_n(dx)\right) & & \\
\quad\quad\quad = \sum\limits_{A\in\mathcal{A}} 2\left(E_Y\left(\|(h_{n,A}-\tilde{h}_{n,A})\|_n^2 \right) + E_Y\left(\|\tilde{h}_{n,A}-h\mathbf{1}_{\{x\in A\}}\|_n^2 \right)\right) & & \\
\end{array} 
\end{equation*}
}
Our next goal is to upper bound the bias and the variance. To bound the bias we will need the following lemma.

\paragraph{Lemma 2.2 [GKKW02]}
Let $M \in \NZ$, $m \in \{1,2,...,M\}$, $\alpha = m + r$, $r \in (0,1]$ and $C > 0$. Let $s_0 \in \mathring {I} \subset \RZ$ and let $g : I \rightarrow \RZ$ be ($\alpha$,C)-{\em{smooth}}.

Then there exists a polynomial $g_0$ of degree $M$ (or less) such that:
\begin{equation*}
|g_0(s) - g(s)| \leq \frac{C}{m!}\cdot|s-s_0|^\alpha \quad (s \in I).
\end{equation*}
In particular,\\
 let $A$ be a cell in $\RZ^D$ and let $z_0 \in \texttt{X}\cap A$ be fixed. For any $z = z_0+ t\cdot v \in A$ with a unit direction $v \in \RZ^D$ and $t\in \RZ$, writing $h(z)=h_v(t)=h(z_0+t\cdot v)$, we have 
\begin{equation*}
 |h(z) - h_0(z)| = |h_v(t) - h_{0_v}(t)| \leq \frac{C}{m!}\cdot|t-0|^\alpha = \frac{C}{m!}\cdot\|z-z_0\|^\alpha.
\end{equation*}
Hence,
\begin{equation}
 \sup\limits_{z\in\texttt{X}\cap A}|h(z) - h_0(z)| \leq \frac{C}{m!}\cdot \Delta_n^\alpha(A).
\end{equation}


\paragraph{Lemma 2.3 (Bias)} 
Fix any partition $\mathcal{A}$ and any set of $n$ points $\texttt{X}=\{X_1,...,X_n\}\subset \mathcal{X}$. Then:
\begin{center}
$\|\tilde{h}_{n,A}-h\mathbf{1}_{\{x\in A\}}\|_n^2 \leq  \frac{C^2}{(m!)^2}\cdot \mu_n(A)\Delta_n^{2\alpha}(A)$ 
\end{center}

{\em{Proof}}: By Lemma 2.1 we have
$h_{n,A} = \sum\limits_{l=1}^{D_m}a_l h_{l,n}$,

and also $E_Y(h_{n,A}) = \sum\limits_{l=1}^{D_m}E_Y(a_l) h_{l,n}$, where $E_Y(a) = (\{E_Y(a_l)\}_{l = 1,...,D_m})^T$. Applying $E_Y$ to (2), we obtain:
\begin{center}
$\frac{1}{n}{Z^TZ}(E_Y(a)) = \frac{1}{n}Z^T(E_Y\{Y\}) = \frac{1}{n}Z^T(\{h(X_i)\}_{i\in I(A)})^T$
\end{center}
so $E_Y(h_{n,A})$ $(= \tilde{h}_{n,A})$ is the least squares estimator in $H_{m,A}$ with respect to $(X_i,h(X_i))_{i\in I(A)}$. Hence:

\begin{equation*}
\begin{array}{lll}
\|\tilde{h}_{n,A}-h\mathbf{1}_{\{x\in A\}}\|_n^2 & =& \frac{1}{n}\sum\limits_{i\in I(A)}|\tilde{h}_{n,A}(X_i)-h(X_i)|^2 \\
%& =& \min\limits_{h' \in H_{m,A}} \|h'-h\mathbf{1}_{\{x\in A\}}\|_n^2 \\
& =& \frac{1}{n}\min\limits_{h'\in H_{m,A}}\sum\limits_{i\in I(A)}|h'(X_i)-h(X_i)|^2 \\
& \leq & \frac{1}{n}\sum\limits_{i\in I(A)}|h_0(X_i)-h(X_i)|^2 \\
& \leq & \frac{|I(A)|}{n}\frac{C^2}{(m!)^2}\Delta_n^{2\alpha}(A) \\
& =& \frac{C^2}{(m!)^2}\cdot \mu_n(A)\Delta_n^{2\alpha}(A), \\
\end{array}
\end{equation*}
where both inequalities use Lemma 2.2. \hfill $\square$\\

\paragraph{Lemma 2.4 (Variance)} [GKKW02]
Fix any partition $\mathcal{A}$ and any set of $n$ points $\texttt{X}=\{X_1,...,X_n\}\subset \mathcal{X}$. Let $H_{m,A}$ be equipped with the empirical scalar product $\langle\cdot,\cdot\rangle_n$ given by
\begin{center}
$\langle h_1,h_2\rangle_n = \frac{1}{n}\sum\limits_{i\in I(A)}h_1(X_i)h_2(X_i)$, \quad where $h_1,h_2 \in H_{m,A}$.
\end{center}
Then
\begin{equation}
E_Y\left(\|(h_{n,A}-\tilde{h}_{n,A})\|_n^2 \right) \leq \frac{\sigma^2}{n}D_m.
\end{equation} 
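For orientation, a sketch of the standard argument behind (4) (our summary, using the inner product above): both $h_{n,A}$ and $\tilde{h}_{n,A}$ are least squares fits that are linear in the response values, so their difference is the least squares fit to the noise $\varepsilon_i := Y_i - h(X_i)$. Expanding in an orthonormal basis $f_1,...,f_{D'}$ of $H_{m,A}$ with respect to $\langle\cdot,\cdot\rangle_n$ (where $D' \leq D_m$), one gets
\begin{center}
$E_Y\left(\|h_{n,A}-\tilde{h}_{n,A}\|_n^2\right) = \sum\limits_{j=1}^{D'} E_Y\langle \varepsilon, f_j\rangle_n^2 = \sum\limits_{j=1}^{D'}\frac{1}{n^2}\sum\limits_{i\in I(A)}\mathrm{Var}\{Y_i|X_i\}f_j^2(X_i) \leq \frac{\sigma^2}{n}\sum\limits_{j=1}^{D'}\|f_j\|_n^2 \leq \frac{\sigma^2}{n}D_m,$
\end{center}
where $\langle \varepsilon, f_j\rangle_n := \frac{1}{n}\sum_{i\in I(A)}\varepsilon_i f_j(X_i)$.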

\subsection{A bound for the risk}
Note that if one assumes 
$\|h\mathbf{1}_A\|_\infty \leq M$ for all $A\in \mathcal{A}$, then, since\\
 ${|\hat{h}_{n,A}(x) - h\mathbf{1}_A(x)| \leq |h_{n,A}(x) - h\mathbf{1}_A(x)|}$ for all $x$, it follows that:
\begin{center}
${\|\hat{h}_{n,A} - h\mathbf{1}_A\|_n^2 \leq \|h_{n,A} - h\mathbf{1}_A\|_n^2}.$
\end{center}

   We can now formulate the main theorem of this section.
   
\paragraph{Theorem 2.1}
Assume $h$ is ($\alpha$,C)-{\em{smooth}} and
\begin{center}
$\|h\|_\infty = \sup\limits_{x\in \mathcal{X}}|h(x)|\leq M \quad \quad $and let$ \quad \quad \hat{h}_{n,A}(.) = T_M h_{n,A}(.)$.
\end{center}
 Then for any fixed partition $\mathcal{A}$ and any fixed set of $n$ points $\texttt{X}=\{X_1,...,X_n\}\subset \mathcal{X}$,
\begin{equation}
E_Y\left(\|\hat{h}_{n,\mathcal{A}} - h\|_n^2\right) \leq 2\bigg(|\mathcal{A}| \frac{\sigma^2}{n}\cdot(m+1)^D
 + \frac{C^2}{(m!)^2}\Delta_n^{2\alpha}(\mathcal{A})\bigg)
\end{equation} 


{\small
{\em{Proof}}: We have:
\begin{equation*}
\begin{array}{lll} 
E_Y\left(\|\hat{h}_{n,\mathcal{A}} - h\|_n^2\right) & \leq &
%E_Y\left(\|h_{n,\mathcal{A}} - h\|_n^2\right)\\
\sum\limits_{A\in\mathcal{A}} 2\left(E_Y\left(\|(h_{n,A}-\tilde{h}_{n,A})\|_n^2 \right) + E_Y\left(\|\tilde{h}_{n,A}-h\mathbf{1}_{\{x\in A\}}\|_n^2 \right)\right) \\
  & \leq & \sum\limits_{A\in\mathcal{A}} 2\left(\frac{\sigma^2}{n}D_m +  \frac{C^2}{(m!)^2}\cdot \mu_n(A)\Delta_n^{2\alpha}(A)\right) \\
 & \leq & 2\bigg(|\mathcal{A}| \frac{\sigma^2}{n}\cdot(m+1)^D
 + \frac{C^2}{(m!)^2}\Delta_n^{2\alpha}(\mathcal{A})\bigg)  
  \\
\end{array}
\end{equation*} 
}
where the second inequality uses Lemmas 2.3 and 2.4 and the last inequality uses
 $D_m \leq (m+1)^D$.   \hfill $\square$
 
 \subsection{Risk bound in terms of the {\em{diameter decrease rate}} with respect to the empirical norm $\|.\|_n$} 
 [$Kpo11$] uses both stopping options, the {\em{cross-validation}} option and the {\em{automatic stopping}} option, in the context of a Lipschitz condition on $h$. Here we adapt these ideas and proofs to our case. \\
 Recall that the \texttt{adaptiveRPtree} procedure starts with a partition $\mathcal{A}^0$ that has a single cell containing all the data, and then grows the tree to get increasingly finer partitions $\mathcal{A}^1$, $\mathcal{A}^2$,..., where the data diameter of each $\mathcal{A}^i$ is half that of $\mathcal{A}^{i-1}$. The {\em{diameter decrease rate}}, denoted $k$, is defined to be the maximum increase in tree depth during each of these individual growth spurts.\\
 By construction:
 \begin{center}
$\Delta_n(\mathcal{A}^i) \leq 2^{-i}\Delta_n(\mathcal{A}^0)$.
\end{center}
\begin{center}
$level(\mathcal{A}^i) \leq ki$.
\end{center}
 
 
\subsubsection{Risk bound for {\em{automatic-stopping option}}}
 The {\em{automatic stopping}} criterion stops growing the tree as soon as
\begin{center}
 $\frac{\Delta_n^{2\alpha}(\mathcal{A}^i)}{\Delta_n^{2\alpha}(\mathcal{A}^0)} \leq \frac{\gamma(n)}{n}2^{level(\mathcal{A}^i)}$
\end{center}
and the shrinkage in diameter is expected to be roughly

\begin{equation*}
\zeta := \left(\frac{\gamma(n)}{n}\right)^{1/(2\alpha + k)}
\end{equation*}
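The exponent in $\zeta$ is chosen precisely so that the two competing terms in the stopping analysis balance: by definition of $\zeta$,
\begin{center}
$\zeta^{2\alpha+k} = \frac{\gamma(n)}{n} \quad \Longrightarrow \quad \frac{\gamma(n)}{n}\left(\frac{1}{\zeta}\right)^{k} = \zeta^{2\alpha},$
\end{center}
which is exactly the identity used in case 1 of the proof of Lemma 2.5 below.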

\paragraph{Lemma 2.5}
Suppose the {\em{automatic stopping}} option is used, \texttt{adaptiveRPtree} attains a {\em{diameter decrease rate}} of $k \geq d$ on \texttt{X}, and let $\zeta := \left(\frac{\gamma(n)}{n}\right)^{1/(2\alpha + k)}$. Assume ${n \geq \gamma(n) \geq 1}$; then the final partition $\mathcal{A}^*$ retained for regression satisfies 
\begin{center}
$\frac{\gamma(n)}{n}\cdot|\mathcal{A}^*| + \Delta_n^ {2\alpha}(\mathcal{A}^*) \leq (4^{\alpha}\Delta_n^{2\alpha}(\mathcal{X}) + 1)\zeta^{2\alpha}$.
\end{center}

{\em{Proof}}: Let $\mathcal{A}^0$, $\mathcal{A}^1$,... be the partitions found by \texttt{adaptiveRPtree} and suppose the stopping criterion first holds for $\mathcal{A}^i$. We consider two cases:\\

Case 1: $level(\mathcal{A}^i) \leq k\log(1/\zeta)$. Then $|\mathcal{A}^i| \leq 2^{level(\mathcal{A}^i)} \leq (1/\zeta)^k$ and by the stopping condition:
\begin{center}
$\frac{\Delta_n^{2\alpha}(\mathcal{A}^i)}{\Delta_n^{2\alpha}(\mathcal{A}^0)} \leq \frac{\gamma(n)}{n}2^{level(\mathcal{A}^i)} \leq \frac{\gamma(n)}{n}\left(\frac{1}{\zeta}\right)^k = \zeta^{2\alpha}$
\end{center} 
Case 2: $level(\mathcal{A}^i) > k\log(1/\zeta)$. Then $ki \geq level(\mathcal{A}^i) > k\log(1/\zeta)$, implying that $i-1 \geq \log(1/(2\zeta))$, whereupon $\Delta_n(\mathcal{A}^{i-1}) \leq 2\zeta\Delta_n(\mathcal{A}^0)$. Moreover, since the stopping condition does not hold for $\mathcal{A}^{i-1}$, we have ${level(\mathcal{A}^{i-1}) <  k\log(1/\zeta)}$.\\
In either case, at least one of $\mathcal{A}^i$ and $\mathcal{A}^{i-1}$ has size at most $(1/\zeta)^k$ and data diameter at most $2\zeta\Delta_n(\mathcal{A}^0)$. It follows that

\begin{center}
$\begin{array}{lll}
\min\limits_{j\in\{i-1,i\}} \left(\frac{\gamma(n)}{n}\cdot|\mathcal{A}^j| + \Delta_n^ {2\alpha}(\mathcal{A}^j)\right) &\leq & \frac{\gamma(n)}{n}\left(\frac{1}{\zeta}\right)^k + 4^{\alpha}\zeta^{2\alpha}\cdot\Delta_n^{2\alpha}(\mathcal{X}) \\
& = &(4^{\alpha}\Delta_n^{2\alpha}(\mathcal{X}) + 1)\zeta^{2\alpha}.
\end{array}$
\end{center} \hfill $\square$ 

\paragraph{Corollary 2.1} Under the assumptions of Lemma 2.5 and Theorem 2.1, the excess risk of the regressor with respect to the empirical norm $\|.\|_n$ is bounded as:
\begin{center}
$E_Y\left(\|\hat{h}_{n,\mathcal{A}^*} - h\|_n^2\right) \leq 2^{2\alpha+1}\left(\sigma^2(m+1)^D + \frac{C^2}{(m!)^2}\right)\left(\Delta^{2\alpha}_\mathcal{X} + 1\right)\cdot\left(\frac{\gamma(n)}{n}\right)^{\frac{2\alpha}{2\alpha + k}}$
\end{center}

{\em{Proof}}: By Theorem 2.1 and Lemma 2.5 we have:

{\small \begin{equation*}
\begin{array}{lll}
E_Y\left(\|\hat{h}_{n,\mathcal{A}^*} - h\|_n^2\right) & \leq & 2\bigg(|\mathcal{A}^*| \frac{\sigma^2}{n}\cdot(m+1)^D + \frac{C^2}{(m!)^2}\Delta_n^{2\alpha}(\mathcal{A}^*)\bigg)\\
 & \leq & 2\left(\sigma^2(m+1)^D + \frac{C^2}{(m!)^2}\right)\bigg(|\mathcal{A}^*| \frac{\gamma(n)}{n} + \Delta_n^{2\alpha}(\mathcal{A}^*)\bigg)\\
 & \leq & 2^{2\alpha+1} \left(\sigma^2(m+1)^D + \frac{C^2}{(m!)^2}\right)\left(\Delta^{2\alpha}_\mathcal{X} + 1\right)\zeta^{2\alpha}
\end{array}
\end{equation*}
 \hfill $\square$\\
}
where we set $\gamma(n)=\sigma^2$, and the last inequality uses Lemma 2.5.\\
In the case $k < d$, replace $k$ by $k_0:=\max\{k,d\}$ in the shrinkage $\zeta$.
 

\section{Risk bound for the regressor with respect to the population $L_2$-norm}
Here we bound the risk of our regressor with respect to the population $L_2$-norm. In order to do this we will need the following notions and the results of Section 2.

\subsection{Covering and Packing Numbers}

\paragraph{Definition 3.1 ($\epsilon$-{\em{covering number}})} Let $\epsilon >0$, let $H$ be a set of functions $\RZ^D\rightarrow \RZ$, $1\leq p <\infty$, and let $\mu$ be a probability measure on $\RZ^D$. The minimal $N$ such that there exist functions $h_1, ... ,h_N: \RZ^D\rightarrow \RZ$ with the property that for every $h\in H$ there is a $j=j(h)\in\{1,...,N\}$ such that
 \begin{center}
 $\|h-h_j\|_{L_p(\mu)}<\epsilon$
 \end{center}
is called the $\epsilon$-{\em{covering number}} of $H$ with respect to $\|\cdot\|_{L_p(\mu)}$ and is denoted by $N(\epsilon,H,\|\cdot\|_{L_p(\mu)})$. 

If $\mu = \mu_n$ is the empirical measure on $x_1^n = (x_1,...,x_n)$, $n$ fixed points in $\RZ^D$, then the $\epsilon$-{\em{covering number}} of $H$ with respect to $\|\cdot\|_{L_p(\mu_n)}$ is denoted by $N_p(\epsilon,H,x_1^n)$.



\paragraph{Definition 3.2 ($\epsilon$-{\em{packing number}})} Let $\epsilon >0$, let $H$ be a set of functions $\RZ^D\rightarrow \RZ$, $1\leq p <\infty$, and let $\mu$ be a probability measure on $\RZ^D$. The maximal $N$ such that there exist functions $h_1, ... ,h_N \in H$ with
 \begin{center}
 $\|h_i-h_j\|_{L_p(\mu)} \geq \epsilon$ \quad for all $1\leq i < j \leq N$
 \end{center}
is called the $\epsilon$-{\em{packing number}} of $H$ with respect to $\|\cdot\|_{L_p(\mu)}$ and is denoted by $M(\epsilon,H,\|\cdot\|_{L_p(\mu)})$. 

Analogously, if $\mu = \mu_n$ is the empirical measure on $x_1^n = (x_1,...,x_n)$, $n$ fixed points in $\RZ^D$, then the $\epsilon$-{\em{packing number}} of $H$ with respect to $\|\cdot\|_{L_p(\mu_n)}$ is denoted by $M_p(\epsilon,H,x_1^n)$.\\
In the next lemma we see that the $L_p$ packing numbers and the $L_p$ covering numbers are closely related.


\paragraph{Lemma 3.1} [GKKW02] Let $H$ be a class of functions on $\RZ^D$, $1\leq p <\infty$, and let $\mu$ be a probability measure on $\RZ^D$. Let $\epsilon > 0$. Then:

\begin{center}
$M(2\epsilon,H,\|\cdot\|_{L_p(\mu)}) \leq N(\epsilon,H,\|\cdot\|_{L_p(\mu)}) \leq M(\epsilon,H,\|\cdot\|_{L_p(\mu)})$
\end{center}
In particular,

\begin{center}
$M_p(2\epsilon,H,x_1^n) \leq N_p(\epsilon,H,x_1^n) \leq M_p(\epsilon,H,x_1^n) \quad $for all $x_1,...,x_n \in \RZ^D$.
\end{center}

\subsection{A bound for the $\epsilon$-{\em{packing number}} of $H_{m,\mathcal{C}_n}$}
Here we first define the sets $H_{m,\mathcal{C}_n}$. Note that the random directions for the hyperplane splits used in the tree are chosen independently of the sample \texttt{X}; this allows us to assume that the random directions are taken from a fixed collection $\mathcal{P}$, whose size $|\mathcal{P}|$ can be upper bounded by $8n^2\log(3n/\delta)$ according to the implementation of \texttt{coreRPtree} (see appendix). Let $\mathcal{H}_{\mathcal{P}}$ be the class of half spaces of $\RZ^D$ defined by hyperplanes normal to the directions in the fixed collection $\mathcal{P}$. Since the tree has height at most $2\log 2n$, each cell of any partition $\mathcal{A}$ is the intersection of at most $2\log 2n$ elements of $\mathcal{H}_{\mathcal{P}}$ (see also [$Kpo11$]). One sees that all such cells are contained in the following collection of subsets of $\RZ^D$:

\begin{center}
$\mathcal{C}_n = \left\{ A : A =\mathcal{X}\cap \left( \bigcap\limits_{j=1}^{2\log 2n} h_j\right), h_j \in \mathcal{H}_{\mathcal{P}}\right\}$
\end{center}
We define also
\begin{center} 
$\begin{array}{lll}
H_{m,\mathcal{C}_n} & := & \{h(x)=\texttt{1}_{\{x\in A\}}p(x);\quad p\in P_m, A \in \mathcal{C}_n \}
\\
H_{m,A}^+ & := & \{\{(x,z)\in \RZ^D\times \RZ: h(x)\geq z\}; \quad h\in H_{m,A}\}
\\
H_{m,\mathcal{C}_n}^+ & := & \{\{(x,z)\in \RZ^D\times \RZ: h(x)\geq z\}; \quad h\in H_{m,\mathcal{C}_n}\}
\end{array}$
\end{center}

\paragraph{Definition 3.3 ({\em{nth shatter coefficient}} and $VC$ dimension)} Let $\mathcal{A}$ be a class of subsets of $\RZ^D$ and let $n\in \NZ$. The {\em{nth shatter coefficient}} of $\mathcal{A}$, denoted $S(\mathcal{A},n)$, is the maximal number of different subsets of $n$ points that can be picked out by sets from $\mathcal{A}$. That is,
\begin{center}
$S(\mathcal{A},n) := \max\limits_{\{x_1,...,x_n\}\subseteq \RZ^D}  |\{A\cap\{x_1,...,x_n\} : A\in \mathcal{A}\}|$,
\end{center}
and the $VC$ dimension $V_{\mathcal{A}}$ is defined for all non-empty $\mathcal{A}$ by 
\begin{center}
$V_\mathcal{A} = \sup\{n \in \NZ: S(\mathcal{A},n) = 2^n\}$.
\end{center}
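For example (a classical fact recalled here for orientation): the class of all halfspaces $\mathcal{H} = \{\{x \in \RZ^D : w^Tx + b \geq 0\} : w\in\RZ^D, b\in\RZ\}$ has $V_{\mathcal{H}} = D+1$, so by Sauer's lemma (Lemma 3.6 below) its shatter coefficients grow only polynomially in $n$. The class $\mathcal{H}_{\mathcal{P}}$ defined above is the smaller subclass of halfspaces whose normal directions lie in the fixed collection $\mathcal{P}$.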

Now we want to upper bound the {\em{$l$-shatter coefficient}} of $H_{m,\mathcal{C}_n}^+$. In the next lemma we first get a bound as a function of the {\em{$l$-shatter coefficient}} of $H_{m,A}^+$ and that of $\mathcal{C}_n$.

\paragraph{Lemma 3.2}
Let $H_{m,A}^+$, $H_{m,\mathcal{C}_n}^+$ be as defined above. It holds:
\begin{center}
$S(H_{m,\mathcal{C}_n}^+,l) \leq S(H_{m,A}^+,l)\cdot S(\mathcal{C}_n,l)$
\end{center} 

{\em{Proof}}: Let $T_l :=\{(x_1,z_1),...,(x_l,z_l)\} \subset \RZ^D\times\RZ$ be pairwise different. Define $V(h,T_l) = (\texttt{1}_{\{h(x_1)\geq z_1\}},...,\texttt{1}_{\{h(x_l)\geq z_l\}})$ for $h \in H_{m,A}$ and $V(H_{m,A},T_l) := \{V(h,T_l); h \in H_{m,A}\}$; then:
\begin{center}
$|V(H_{m,A},T_l)| \leq S(H_{m,A}^+,l)$
\end{center}

Now let $Z(A,T_l) := (\texttt{1}_{\{x_1 \in A\}},...,\texttt{1}_{\{x_l \in A\}})$ and let $U(T_l) := (\texttt{1}_{\{z_1 \leq 0\}},...,\texttt{1}_{\{z_l \leq 0\}})$.
 Define $W(h,A,T_l) = (\texttt{1}_{\{h(x_1)\texttt{1}_A(x_1) \geq z_1\}},...,\texttt{1}_{\{h(x_l)\texttt{1}_A(x_l) \geq z_l\}})$ for $h \in H_{m,A}$, $A \in \mathcal{C}_n$, and let $W(H_{m,A},\mathcal{C}_n,T_l) = \{W(h,A,T_l);  h \in H_{m,A}, A \in \mathcal{C}_n\}$; then:
\begin{center}
$W(h,A,T_l) = V(h,T_l)(.*)Z(A,T_l) + U(T_l)(.*)(1-Z(A,T_l))$,
\end{center}
where (.*) is the component-by-component product of vectors.\\
Hence the inequality $|\{Z(A,T_l); A \in \mathcal{C}_n\}| \leq S(\mathcal{C}_n,l)$ implies:
\begin{center}
$|W(H_{m,A},\mathcal{C}_n,T_l)| \leq S(H_{m,A}^+,l)\cdot S(\mathcal{C}_n,l)$
\end{center} \hfill $\square$

In the following we separately upper bound $S(H_{m,A}^+,l)$ and $S(\mathcal{C}_n,l)$. We denote by $s(\mathcal{A},\{x_1,...,x_n\})$ the number of different subsets of $\{x_1,...,x_n\}$ of the form $A\cap \{x_1,...,x_n\}$, $A\in \mathcal{A}$. 

\paragraph{Lemma 3.3} Let  $H_{m,A}^+$ be defined as above. Then
\begin{center}
$V_{T_M H_{m,A}^+} \leq V_{H_{m,A}^+}$
\end{center}

{\em{Proof}}: Let $(x,z) \in \RZ^D \times \RZ$. If $z>M$, then $(x,z)$ is not contained in any set of $T_M H_{m,A}^+$; if $z\leq -M$, then $(x,z)$ is contained in every set of $T_M H_{m,A}^+$.
This implies that if $s(T_M H_{m,A}^+,\{(x_1,z_1),...,(x_n,z_n)\}) = 2^n$, then the $z$-coordinates of these points must satisfy $-M < z_i \leq M$, and then also $s(H_{m,A}^+,\{(x_1,z_1),...,(x_n,z_n)\}) = 2^n$. This ends the proof.
 \hfill $\square$\\

\paragraph{Lemma 3.4} It holds:
\begin{center}
$V_{H_{m,A}^+} \leq D_m+1.$
\end{center}

{\em{Proof}}: This follows directly from Theorem 9.5 in [$GKKW02$] and 
\begin{center}
$H_{m,A}^+  \subseteq \{\{(x,z) \in \RZ^D \times\RZ:h(x) + \zeta\cdot z \geq 0\} : h \in H_{m,A}, \zeta \in \RZ\}$.
\end{center} \hfill $\square$


\paragraph{Lemma 3.5} Let $\mathcal{C}_n$ be as defined above. Then
\begin{center}
$S(\mathcal{C}_n,l) \leq (2l|\mathcal{P}|+1)^k$
\end{center}
where $k=2\log(2n)$. \\

{\em{Proof}}: Recall that
\begin{center}
$\mathcal{C}_n = \left\{ A : A = \mathcal{X}\cap\left( \bigcap\limits_{j=1}^{2\log 2n} h_j\right) , h_j \in \mathcal{H}_{\mathcal{P}}\right\}$
\end{center}
The lemma then follows from the fact that for any $v \in \mathcal{P}$ and $l$ sample points, there are at most $2l$ possible intersections of the sample with halfspaces normal to $v$: each of the $k=2\log(2n)$ halfspaces in the intersection defining a cell thus contributes at most $2l|\mathcal{P}|+1$ distinct traces. \hfill $\square$


\paragraph{Lemma 3.6 ({\em{Sauer}})}  Let $\mathcal{A}$ be a class of subsets of $\RZ^D$ with VC dimension $V_{\mathcal{A}} < \infty $. Then for $l \in \NZ$
\begin{center}
$S(\mathcal{A},l) \leq (l+1)^{V_{\mathcal{A}}}$
\end{center}
and, for all $l \geq V_{\mathcal{A}}$,
\begin{center}
$S(\mathcal{A},l) \leq \left(\frac{el}{V_{\mathcal{A}}}\right)^{V_{\mathcal{A}}}.$
\end{center}

Now we can upper bound the {\em{$L_p$ packing number}} of $T_M H_{m,\mathcal{C}_n}$. Our bound (Lemma 3.8 below) uses the VC dimension of classes of sets of the form
\begin{center} 
$H^+ :=\{\{(x,z)\in \RZ^D\times \RZ: h(x)\geq z\}; \quad h\in H\}.$
\end{center}



\paragraph{Lemma 3.8} Let $T_M H_{m,\mathcal{C}_n}$ be defined as above. Let $V_{T_M H_{m,\mathcal{C}_n}^+} \geq 2$, $p \geq 1$, and let $\mu$ be a probability measure on $\RZ^D$. Let $0 < \epsilon < \frac{M}{2}$. Then 

\begin{equation*}
\begin{array}{lll}
\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_p(\mu)}) & & \\
\quad \leq 3 \left(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\frac{(2M)^p}{\epsilon^p}\log(48n\cdot\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\frac{(2M)^p}{\epsilon^p}) \right)^{2\cdot m_n} & &
\end{array}
\end{equation*}

where $m_n := \max{\{D_m+1,2\log(2n)\}}$ and $\delta$ is the confidence parameter of the algorithm.

\subsection{A bound of the risk with respect to the population $L_2$-norm}

The following lemma is very useful for relating the results of Section 2 to the risk with respect to the population $L_2$-norm.

\paragraph{Lemma 3.9}  $[GKKW02]$ Let $H$ be a class of functions $h: \RZ^D\rightarrow \RZ$ bounded in absolute value by $M$. Let $\epsilon > 0$. Then

\begin{equation}
P(\exists h \in H: \|h\| - 2\|h\|_n > \epsilon) \leq 3E\mathcal{N}_2\left(\frac{\sqrt{2}}{24}\epsilon, H, X_1^{2n}\right)\exp\left(-\frac{n\epsilon^2}{288M^2} \right)
\end{equation}

Now we can formulate our main Theorem. 

\paragraph{Theorem 3.1} Assume $h$ is $(\alpha,C)$-{\em{smooth}} and
 
\begin{center}
$\|h\|_\infty = \sup\limits_{x\in \mathcal{X}}|h(x)|\leq M \quad $and let$\quad \hat{h}_{n,A}(.) = T_M h_{n,A}(.)$.
\end{center}
For any $\delta > 1/n$ we have, with probability at least $1-\delta$ over the choice of $\texttt{X}$, for all partitions obtained during the construction of the tree:
\begin{equation}
E_Y\|\hat{h}_{n,\mathcal{A}} - h\|^2 \leq C_1\bigg( \max{\{M^2,\sigma^2\}}\frac{\log^2(n)+1}{n}\cdot D_m\cdot|\mathcal{A}|+ \frac{C^2}{(m!)^2}\Delta_n^{2\alpha}(\mathcal{A})\bigg)
\end{equation}
for some universal constant $C_1$.\\

We can now derive, in a manner analogous to Section 2, a risk bound in terms of the {\em{diameter decrease rate}}.

\subsubsection{Risk bound for $cross$-$validation$ option}
Here we define our shrinkage in diameter $\Delta_n(\mathcal{A}^i)/\Delta_n(\mathcal{A}^0)$ as follows:
\begin{center}
$\zeta :=\left(\frac{\Delta^{2\alpha}_\mathcal{Y}}{C^2\Delta^{2\alpha}_\mathcal{X}}\cdot\frac{\gamma(n)}{n}\right)^{\frac{1}{2\alpha+k}}$
\end{center}

\paragraph{Lemma 3.10} Suppose \texttt{adaptiveRPtree} is run with the {\em{cross-validation}} option and yields a sequence of partitions $\mathcal{A}^0,\mathcal{A}^1,...$ with a {\em{diameter decrease rate}} of $k$. Let $\zeta$ be as defined above and let ${\gamma(n) \geq 1}$. If \\ ${n \geq \max\{\gamma(n),C^2\Delta^{2\alpha}_\mathcal{X}/\Delta^{2\alpha}_\mathcal{Y},\gamma(n)\Delta^{2\alpha}_\mathcal{Y}/C^2\Delta^{2\alpha}_\mathcal{X}\}}$, then there exists
$ i \geq 0 $ such that $ \Delta_n(\mathcal{A}^i) \leq 2\zeta\Delta_n(\mathcal{X}) $ and $ |\mathcal{A}^i| \leq (1/\zeta)^k$.\\

{\em{Proof}}: See [$Kpo11$]. \hfill $\square$
 
\paragraph{Corollary 3.1} Under the hypotheses of Lemma 3.10, with probability at least $1-\delta$ over \texttt{X}, $(X',Y')$ and the randomness in the algorithm, the excess risk of the final regressor is bounded by:

\begin{equation*}
\begin{array}{lll}
E_Y\left(\|\hat{h}_{n,\mathcal{A}^*} - h\|^2\right) & \leq & C_0 C^2\Delta^{2\alpha}_\mathcal{X}\cdot\left((M^2\vee \sigma^2)\frac{(m+1)^D}{\Delta^{2\alpha}_\mathcal{Y}}+ \frac{4^\alpha}{(m!)^2} \right)\left(\frac{\Delta^{2\alpha}_\mathcal{Y}}{C^2\Delta^{2\alpha}_\mathcal{X}}\cdot\frac{\gamma(n)}{n}\right)^{\frac{2\alpha}{2\alpha+k}}\\
 & & + 32\Delta^2_\mathcal{Y}\frac{2\log2n + \ln(1/\delta)}{n},\\
\end{array}
\end{equation*}

for some universal constant $C_0 \in \RZ$, where $M^2\vee \sigma^2:= \max{\{M^2,\sigma^2\}}$.\\

{\em{Proof}}: Let $\mathcal{A}^{i_0}$ and $\zeta$ be as in Lemma 3.10. By applying Theorem 3.1 and Lemma 3.10 we have:
{\small \begin{equation*}
\begin{array}{lll}
E_Y\left(\|h_{n,\mathcal{A}^{i_0}} - h\|^2\right) & \leq & C_1\left(\max{\{M^2,\sigma^2\}}\frac{\log^2(n)+1}{n}|\mathcal{A}^{i_0}|D_m + \frac{C^2}{(m!)^2}\cdot\Delta_n^{2\alpha}(\mathcal{A}^{i_0})\right)\\
 & \leq & C_1'\left(\max{\{M^2,\sigma^2\}}D_m\frac{\log^2(n)}{n}\zeta^{-k} + 4^\alpha\cdot\frac{C^2}{(m!)^2}\cdot\zeta^{2\alpha}\Delta^{2\alpha}_\mathcal{X}\right)\\
 & \leq & C_1'\left(\max{\{M^2,\sigma^2\}}D_m\frac{C^2\Delta^{2\alpha}_\mathcal{X}}{\Delta^{2\alpha}_\mathcal{Y}}\cdot\zeta^{2\alpha} + 4^\alpha\cdot\frac{C^2}{(m!)^2}\cdot \Delta^{2\alpha}_\mathcal{X}\zeta^{2\alpha}\right)\\
 & \leq & C_1'\left(\max{\{M^2,\sigma^2\}}\frac{D_m}{\Delta^{2\alpha}_\mathcal{Y}} + \frac{4^\alpha}{(m!)^2}\right)\cdot C^2\Delta^{2\alpha}_\mathcal{X}\zeta^{2\alpha}
\end{array}
\end{equation*}
}
This bounds the risk of the regressor built on the oracle partition $\mathcal{A}^{i_0}$; it remains to relate the cross-validated choice to it. In the following we fix the at most $\log2n$ partitions $\mathcal{A}^k$ generated by \texttt{adaptiveRPtree} and we set
 \begin{center}
$Z_i^k=L(h_{\mathcal{A}^k})-L(h)- \Big(l(h_{\mathcal{A}^k},X_i',Y_i')- l(h,X_i',Y_i')\Big)$ 
\end{center}
for $i=1,...,n$ and $k=1,...,\log2n$, where $l(h',x,y):=|y-h'(x)|^2$ denotes the squared loss, $L(h'):=E\,l(h',X',Y')$ the true risk, and $\hat{L}$ the empirical risk on the validation sample.\\
Then $|Z_i^k|\leq L:=4\Delta_\mathcal{Y}^2$, and note that the $\{Z_i^k\}_{i=1,...,n}$ are i.i.d. and centered. Applying Bernstein's inequality, we have with probability at least \\$1-2\exp{(-t)}\cdot{\log2n}$:
\begin{center}
$\begin{array}{lll}
|L(h_{\mathcal{A}^k})-L(h)- \big( \hat{L}(h_{\mathcal{A}^k})-\hat{L}(h) \big)|& \leq & \frac{2tL}{3n} +  \sqrt{2\mathrm{Var}(Z_i^k)\cdot\frac{t}{n}}\\
 &\leq & \frac{2tL}{3n} +  \mathrm{Var}(Z_i^k)\cdot\alpha + \frac{t}{n\alpha}
\end{array}$
\end{center}
for all $k=1,...,\log2n$ and $\alpha >0$, where the second inequality uses Young's inequality.\\
Using $\mathrm{Var}(Z_i^k) \leq L\cdot \big(L(h_{\mathcal{A}^k})-L(h)\big)$, choosing $\alpha = \frac{1}{2L}$ and setting \\
$\delta= 2\exp{(-t)}\cdot{\log2n}$, we have
\begin{equation}
|L(h_{\mathcal{A}^k})-L(h)- \big( \hat{L}(h_{\mathcal{A}^k})-\hat{L}(h) \big)| \leq \frac{1}{2}\big(L(h_{\mathcal{A}^k})-L(h)\big) +  2L\frac{2\log2n + \ln(1/\delta)}{n}
\end{equation}
with probability at least $1-\delta$.\\
Now if $\hat{h}= h_{\mathcal{A}^{\hat{k}}}$ is the empirical risk minimizer, then
\begin{center}
$\begin{array}{lll}
L(\hat{h})-L(h)& \leq & 2\Big(\hat{L}(\hat{h})-\hat{L}(h)\Big) +4L\frac{2\log2n + \ln(1/\delta)}{n}\\
 & \leq & 2\Big(\hat{L}({h_{\mathcal{A}^{i_0}}})-\hat{L}(h)\Big) +4L\frac{2\log2n + \ln(1/\delta)}{n}\\
 
 & \leq & 2\Big(\frac{3}{2}\Big(L({h_{\mathcal{A}^{i_0}}})-L(h)\Big) + 2L\frac{2\log2n + \ln(1/\delta)}{n}\Big) +4L\frac{2\log2n + \ln(1/\delta)}{n}\\
 & \leq & 3\Big(L({h_{\mathcal{A}^{i_0}}})-L(h)\Big) +8L\frac{2\log2n + \ln(1/\delta)}{n}
\end{array}$
\end{center}
where (8) is used for the first and the third inequality, and the second inequality holds since $\hat{h}$ minimizes the empirical risk.

 \hfill 
 $\square$





\paragraph{Corollary 3.2 ({\em{automatic stopping}} option)} Under the assumptions of Lemma 2.5 and Theorem 3.1, the excess risk of the regressor is bounded by

\begin{equation*}
\begin{array}{lll}
E_Y\|\hat{h}_{n,\mathcal{A}^*} - h\|^2 & & \\  
\quad \quad \quad \quad \quad \leq C'_2\left(\max\{\sigma^2,M^2\}(m+1)^D + \frac{C^2}{(m!)^2}\right)\left(\Delta^{2\alpha}_\mathcal{X} + 1\right)\left(\frac{\log^2n}{n}\right)^{\frac{2\alpha}{2\alpha +k}} & &
\end{array}
\end{equation*}

for some constant $C'_2 \in \RZ$ depending on $\alpha$.\\

{\em{Proof}}: This follows directly from Theorem 3.1 and Lemma 2.5. \hfill $\square$
 \clearpage

\appendix

\addcontentsline{toc}{section}{Appendix}
\section*{Appendix}
\vspace{1cm}

{\em{Proof of Theorem 3.1}}: It holds:

\begin{equation*}
E_Y\|\hat{h}_{n,\mathcal{A}} - h\|^2 = E_Y\big(\sum\limits_{A\in\mathcal{A}}\int_A|\hat{h}_{n,A}-h.1_A|^2d\mu \big) = E_Y\big(\sum\limits_{A\in \mathcal{A}}\|\hat{h}_{n,A} -h.1_A\|^2 \big)
\end{equation*}
and
\begin{equation*}
\begin{array}{lll}
\|\hat{h}_{n,A} -h.1_A\|^2 & = & \big(\|\hat{h}_{n,A} -h.1_A\| - 2\|\hat{h}_{n,A} -h.1_A\|_n + 2\|\hat{h}_{n,A} -h.1_A\|_n\big)^2\\
& \leq & \big(\max\{\|\hat{h}_{n,A} -h.1_A\| - 2\|\hat{h}_{n,A} -h.1_A\|_n,0\} + 2\|\hat{h}_{n,A} -h.1_A\|_n\big)^2\\
& \leq & 2\left(\max\{\|\hat{h}_{n,A} -h.1_A\| - 2\|\hat{h}_{n,A} -h.1_A\|_n,0\}\right)^2 + 8\|\hat{h}_{n,A} - h.1_A\|_n^2\\
& = & Z_1(A) + Z_2(A)
\end{array}
\end{equation*}
then
\begin{center}
$E_Y\|\hat{h}_{n,\mathcal{A}} - h\|^2 \leq E_Y\left(\sum\limits_{A\in\mathcal{A}}Z_1(A)\right) + E_Y\left(\sum\limits_{A\in\mathcal{A}}Z_2(A)\right)$
\end{center}
Since $h$ is bounded on  $\mathcal{X}$ by $M$, it follows that $h.1_A$ is also bounded by $M$ for every $A\in \mathcal{A}$.
This implies
\begin{center}
$\|\hat{h}_{n,A} - h.1_A\|_n^2 \leq \|h_{n,A} -h.1_A\|_n^2.$
\end{center}
and:
\begin{equation*}
\begin{array}{lll}
E_Y\big(\sum\limits_{A\in\mathcal{A}}Z_2(A)\big) & \leq & 8E_Y\|h_{n,\mathcal{A}} - h\|_n^2\\
& \leq & \tilde{C}_2\left(|\mathcal{A}| \frac{\sigma^2}{n}\cdot D_m + \frac{C^2}{(m!)^2}\cdot \Delta_n^{2\alpha}(\mathcal{A})\right)  (*)\\
\end{array}
\end{equation*}

It remains now to upper bound $E_Y\left(\sum\limits_{A\in\mathcal{A}}Z_1(A)\right)$. Let $z>\frac{576M^2}{n}$ be arbitrary. Then, by Lemma 3.9,

{\small
\begin{equation*}
\begin{array}{lll}
P\left(Z_1(A)>z\right) & = & P\left(2\left(\max\{\|\hat{h}_{n,A} - h.1_A\| - 2\|\hat{h}_{n,A} -h.1_A\|_n,0\}\right)^2 > z \right)\\
& = & P\left(\max\{\|\hat{h}_{n,A} -h.1_A\| - 2\|\hat{h}_{n,A} - h.1_A\|_n,0\} > \sqrt{z/2} \right)\\
& \leq & P\left(\exists h_0 \in T_M H_{m,A} :\|h_0 -h.1_A\| - 2\|h_0 - h.1_A\|_n > \sqrt{z/2} \right)\\
& \leq & P\left(\exists h_0 \in T_M H_{m,\mathcal{C}_n} :\|h_0 -h.1_A\| - 2\|h_0 - h.1_A\|_n > \sqrt{z/2} \right)\\
& \leq & 3E\mathcal{N}_2\left(\frac{\sqrt{z}}{24}, T_M H_{m,\mathcal{C}_n}, X_1^{2n}\right)\exp\left(-\frac{nz}{576(2M)^2} \right)\\
& \leq & 3E\mathcal{N}_2\left(\frac{M}{\sqrt{n}}, T_M H_{m,\mathcal{C}_n}, X_1^{2n}\right)\exp\left(-\frac{nz}{576(2M)^2} \right)\\
& \leq & 9(768^2\cdot e)^{2m_n}\cdot n^{12m_n}\cdot m_n^{2m_n}\log(3n/\delta)^{2m_n}\cdot\exp\left(-\frac{nz}{2304M^2}\right)\\
\end{array}
\end{equation*}
}
where the last inequality uses the following:

\begin{equation*}
\begin{array}{lll}
\mathcal{N}_2\left(\frac{M}{\sqrt{n}}, T_M H_{m,\mathcal{C}_n}, X_1^{2n}\right) & & \\
\quad \leq \mathcal{M}_2\left(\frac{M}{\sqrt{n}}, T_M H_{m,\mathcal{C}_n}, X_1^{2n}\right) & & \\
\quad \leq 3\big(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot 16n^2\cdot\log(48n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot 16n^2) \big)^{2m_n} & & \\
\quad  <   3\big(768n^3\cdot\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\log(768n^3\cdot\sqrt{e\cdot m_n\cdot\log(3n/\delta)}) \big)^{2m_n} & & \\
\quad \leq 3\big((768n^3)^2\cdot e\cdot m_n\cdot\log(3n/\delta) \big)^{2m_n} & & \\
\quad = 3\big((768^2\cdot e)\cdot n^6\cdot m_n\cdot\log(3n/\delta) \big)^{2m_n} & & \\
\end{array}
\end{equation*}
Here the first inequality uses Lemma 3.1 and the second uses Lemma 3.8.\\

Set  $z_n := \frac{C_0\cdot M^2\cdot D_m\log^2(n)}{n}\quad \quad$ and
\begin{equation*}
\delta_0 := 9(768^2\cdot e)^{2m_n}\cdot n^{12m_n}\cdot m_n^{2m_n}\log(3n/\delta)^{2m_n}\cdot\exp\left(-\frac{nz_n}{2304M^2}\right).
\end{equation*}
Then:
\begin{equation*}
\exp\left(-\frac{nz_n}{2304M^2}\right) = \exp\left(-\frac{C_0\cdot D_m\log^2(n)}{2304}\right) =  \exp(- C'_0\cdot D_m\log^2(n)) = n^{-C'_0\cdot D_m\log(n)}
\end{equation*}
and $E\left(P_Y(\sup Z_1 \geq z_n) \right) = P(\sup Z_1 \geq z_n) \leq \delta_0$.

If one chooses $C'_0$ such that $n\cdot\delta_0 \leq \frac{1}{n}$, then by Markov's inequality we have

\begin{equation*}
P_X\left[P_Y(\sup Z_1 \geq z_n) > \frac{1}{n} \right] \leq \frac{\delta_0}{\frac{1}{n}} \leq \frac{1}{n} \leq \delta .
\end{equation*}

Hence

\begin{equation*}
\begin{array}{lll}
E_Y\big(\sum\limits_{A\in\mathcal{A}}Z_1(A)\big) &\leq & E_Y\big(\sum\limits_{A\in\mathcal{A}} Z_1(A)\cdot\texttt{1}_{\{\sup Z_1 \leq z_n\}}\big) + E_Y\big(\sum\limits_{A\in\mathcal{A}} Z_1(A)\cdot\texttt{1}_{\{\sup Z_1 \geq z_n\}}\big)\\
& \leq & C_0\cdot\frac{\log^2(n)}{n}\cdot M^2\cdot D_m\cdot E_Y(|\mathcal{A}|) + 8M^2E_Y(|\mathcal{A}|)P_Y(\sup Z_1 \geq z_n) \\
& = & C_0\cdot\frac{\log^2(n)}{n}\cdot M^2\cdot D_m\cdot |\mathcal{A}| + 8M^2\cdot|\mathcal{A}|\cdot P_Y(\sup Z_1 \geq z_n) \\
& \leq & C_0\cdot\frac{\log^2(n)}{n}\cdot M^2\cdot D_m\cdot |\mathcal{A}| + 8M^2\cdot|\mathcal{A}|\cdot\frac{1}{n} \\
& \leq & C_0\cdot\frac{\log^2(n)+1}{n}\cdot M^2\cdot D_m\cdot |\mathcal{A}|  \\
\end{array}
\end{equation*}
where the third inequality holds with probability at least $1-\delta$. \hfill $\square$\\


{\em{Proof of Lemma 3.8}}: Based on ideas similar to the proof of Theorem 9.4 in [GKKW02], we first prove the lemma for $p=1$.\\
Let $p=1$, let ${l = \lfloor \frac{2M}{\epsilon}\log\left(2\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) \right) \rfloor}$ and let $m_n := \max\{D_m +1,k\}$, where $k=2\log(2n)$. We have:

{\small
\begin{equation*}
\begin{array}{lll}
\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)})& \leq & 3 S(T_M H^+_{m,\mathcal{C}_n},l) \\
 & \leq & 3 S(H^+_{m,\mathcal{C}_n},l) \\
 & \leq & 3 S(H_{m,A}^+,l)\cdot S(\mathcal{C}_n,l) \quad \quad \quad \quad \quad \quad \quad \quad \mbox{(Lemma 3.2)} \\
 & \leq & 3 (l+1)^{V_{H_{m,A}^+}}\cdot(2l|\mathcal{P}|+1)^k \quad \quad \quad \mbox{(Lemmas 3.5 and 3.6)} \\
 & \leq & 3 (l+1)^{D_m +1}\cdot (4l|\mathcal{P}|)^k \quad \quad \quad \quad \quad \quad \quad \mbox{(Lemma 3.4)} \\
 & \leq & 3 (l+1)^{m_n}\cdot (4l|\mathcal{P}|)^{m_n} \\
 & \leq & 3 \left(\frac{el}{m_n}\right)^{m_n}\cdot(4l|\mathcal{P}|)^{m_n} \quad\quad \quad \mbox{(Lemma 3.6, for } l \geq m_n\mbox{)}\\
 & \leq & 3 \left(\frac{4e|\mathcal{P}|l^2}{m_n}\right)^{m_n} \\
 & \leq & 3 \left(\frac{4e|\mathcal{P}|}{m_n}\cdot\frac{(2M)^2}{\epsilon^2}\log^2\left(2\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) \right)\right)^{m_n}\\
 & \leq & 3 \left(\frac{4\sqrt{e|\mathcal{P}|}}{2\sqrt{m_n}}\cdot\frac{2M}{\epsilon}\log\left(2\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) \right)\right)^{2m_n}\\
 & \leq & 3 \left(\frac{8M\sqrt{e|\mathcal{P}|\cdot m_n}}{\epsilon}\cdot\frac{1}{2m_n}\log\left(2\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) \right)\right)^{2m_n}\\
\end{array}
\end{equation*}
}

Now if one sets $x = \mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)})$, $a = \frac{8M\sqrt{e|\mathcal{P}|\cdot m_n}}{\epsilon}$ and 
$b = 2m_n$, then $a\geq e$ and $b \geq 2$, and the last inequality reads
\begin{center}
$x \leq 3\left(\frac{a}{b}\log(2x)\right)^b,$
\end{center}
which implies
\begin{center}
$x \leq 3\left(2a\log(3a)\right)^b$
\end{center}
(see also the proof of Theorem 9.4 in [GKKW02]). Substituting $a$, $b$ and the upper bound on $|\mathcal{P}|$, we get:
\begin{equation*}
\begin{array}{lll}
x &\leq &3\left(8\sqrt{e|\mathcal{P}|\cdot m_n}\cdot\frac{2M}{\epsilon}\log(12\sqrt{e|\mathcal{P}|\cdot m_n}\cdot\frac{2M}{\epsilon})\right)^{2m_n} \\
  &\leq & 3\left(8n\sqrt{8e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon}\log(12n\sqrt{8e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon})\right)^{2m_n}\\
  &\leq & 3\left(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon}\log(48n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon})\right)^{2m_n}.
\end{array}
\end{equation*}

But if $l = \lfloor \frac{2M}{\epsilon}\log\left(2\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) \right) \rfloor \leq m_n$, then

\begin{equation*}
\begin{array}{lll}
\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) & & \\
\quad \quad \leq \frac{1}{2}\exp\left(\frac{\epsilon(m_n +1)}{2M}\right) & & \\
\quad \quad \leq \frac{e}{2}\exp(2m_n) \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad\quad\quad\text{ since }\quad \quad 0< \epsilon \leq M/2 & & \\
\quad \quad \leq 3\left(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon}\log(48n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon})\right)^{2m_n} & &
\end{array}
\end{equation*}

It remains to treat the case $1 <p < \infty$. Let $h_i$, $h_j \in T_M H_{m,\mathcal{C}_n}$. Since

\begin{center}
$\|h_i - h_j\|_{L_p(\mu)}^p \leq (2M)^{p-1}\|h_i - h_j\|_{L_1(\mu)}$
\end{center}
every {\em{$L_p$ $\epsilon$-packing}} of $T_M H_{m,\mathcal{C}_n}$ is also an {\em{$L_1$ $(\frac{\epsilon^p}{(2M)^{p-1}})$-packing}} of $T_M H_{m,\mathcal{C}_n}$. Together with the result for $p=1$, this implies:

\begin{equation*}
\begin{array}{lll}
\mathcal{M}(\epsilon, T_M H_{m,\mathcal{C}_n},\|.\|_{L_p(\mu)}) & & \\
\leq  \mathcal{M}(\frac{\epsilon^p}{(2M)^{p-1}}, T_M H_{m,\mathcal{C}_n},\|.\|_{L_1(\mu)}) & & \\
\leq 3\left(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon^p/(2M)^{p-1}}\log(48n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{2M}{\epsilon^p/(2M)^{p-1}}) \right)^{2m_n} & & \\
= 3\left(32n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{(2M)^p}{\epsilon^p}\log(48n\sqrt{e\cdot m_n\cdot\log(3n/\delta)}\cdot\frac{(2M)^p}{\epsilon^p}) \right)^{2m_n} & & 
\end{array}
\end{equation*} 


\hfill $\square$




\end{document}